import numpy as np
import seaborn as sns
import pandas as pd
import matplotlib.pyplot as plt
import tqdm as tqdm
import sklearn.metrics
import hdbscan
import time
import random
import datetime
%matplotlib inline
from datetime import datetime, timedelta
# Load the saved syllable archive and pull each stored array out by its key.
recon = np.load('/mnt/cube/tsainbur/github_repos/ModelComparisonProject/data/2D_syllable_coordinates/z_values_ST_syllables_silence_just_B1114.npz')
syllable_z, syllable_time, bird_name, specs = (
    recon['recon_z'],
    recon['recon_time'],
    recon['recon_name'],
    recon['all_x'],
)
recon_length, recon_folder, recon_t_rel_wav = (
    recon['recon_length'],
    recon['recon_folder'],
    recon['recon_t_rel_wav'],
)

# Timestamps are stored as text; parse them into datetime objects.
syllable_time = np.array([
    datetime.strptime(stamp, "%d/%m/%y %H:%M:%S.%f") for stamp in syllable_time
])

# Distance of every embedded point from the origin.
origin = [0, 0]
radius = np.array([np.linalg.norm(origin - point) for point in syllable_z])

# Rescale each point along its own direction so that its distance from the
# origin becomes log(radius).
# NOTE(review): the scale is negative for radius < 1 and undefined for
# radius == 0 -- confirm that the embedding never produces such points.
log_scale = np.log(radius)[:, None] / radius[:, None]
syllable_z_log = log_scale * syllable_z
# Collect every per-syllable array into one table, one row per syllable.
BirdData = pd.DataFrame({
    'specs': specs.tolist(),
    'syllable_z_log': syllable_z_log.tolist(),
    'syllable_time': syllable_time.tolist(),
    'bird_name': bird_name.tolist(),
    'recon_length': recon_length.tolist(),
    'recon_folder': recon_folder.tolist(),
    'recon_t_rel_wav': recon_t_rel_wav.tolist(),
})
BirdData[0:3]  # bare expression: only displays a preview when run as a notebook cell
#test = BirdData[BirdData['bird_name'] == 'b1114'].sort_values(['sequence_num','sequence_syllable'])
#test[0:50]

# 'recon_length' is summed and divided by 60 twice to report hours.
total_hours = np.sum(BirdData['recon_length'])/60/60
# Single-argument print emits the same text under Python 2 and Python 3.
print("%s %s" % ("Total Data (Hours): ", total_hours))

# Syllables no longer than this (same unit as 'recon_length') are discarded below.
length_cutoff = 0.15
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 4))
_ = plt.hist(BirdData['recon_length'], bins=160)
plt.title('Distribution of syllable lengths (seconds)')
# axvline's ymin/ymax are fractions of the axes height (0 = bottom, 1 = top),
# not data coordinates, so the full-height marker is ymax=1.
plt.axvline(x=length_cutoff, ymin=0, ymax=1, color='red')

# Keep only syllables longer than the cutoff.
BirdData = BirdData[BirdData['recon_length'] > length_cutoff]
# Bare expression: per-bird count of the remaining syllables (notebook display).
[(bird, np.sum(BirdData['bird_name'] == bird)) for bird in np.unique(BirdData['bird_name'])]
# Renumber the rows 0..N-1 so that later label-based indexing matches position.
BirdData.index = range(len(BirdData))
def split_seq_by_time(times, idxs, max_timedelta=30):
    """Group indices into sequences separated by long gaps in time.

    The inputs are ordered by time, and a new sequence starts wherever the
    gap to the previous timestamp is strictly greater than ``max_timedelta``.

    Parameters
    ----------
    times : array-like of numpy datetime64
        Timestamp of each element. Gaps are divided by
        ``np.timedelta64(1, 's')``, so the values must be datetime64.
        The caller's array is left untouched (the previous implementation
        sorted it in place).
    idxs : array-like
        Identifier of each element, parallel to ``times``.
    max_timedelta : float
        Largest gap, in seconds, still considered part of the same sequence.

    Returns
    -------
    list of numpy.ndarray
        One array of ``idxs`` values per sequence, each in time order.
        Empty input yields an empty list.
    """
    times = np.asarray(times)
    idxs = np.asarray(idxs)

    # One stable ordering drives both arrays so they stay aligned.
    order = times.argsort(kind='stable')
    times_sorted = times[order]
    idxs_sorted = idxs[order]

    # Seconds elapsed since the previous element; the first element gets 0.
    time_before = np.concatenate(
        ([0.], np.diff(times_sorted) / np.timedelta64(1, 's')))

    # Sequence boundaries: the start, every position following a long gap,
    # and the end.
    sequence_breaks = np.unique(np.concatenate((
        np.where(time_before > max_timedelta)[0],
        np.array([0, len(times_sorted)]))))

    return [idxs_sorted[start:stop]
            for start, stop in zip(sequence_breaks[:-1], sequence_breaks[1:])]
# Gap (in seconds) after which the next syllable is treated as opening a new bout.
max_timedelta = 10.

# Label by day, and by sequence within day.  -2 marks rows not yet labelled.
for new_column in ('sequence_num', 'day_num', 'sequence_syllable'):
    BirdData[new_column] = -2

seq_lengths = []
all_dates = [stamp.date() for stamp in BirdData['syllable_time']]
seq_num_tot = 0  # running bout counter shared across all birds

for bird in np.unique(bird_name):
    # Disabled filter kept from the original notebook:
    #     if bird == 'b1114': break
    #     else: continue

    # Number this bird's recording days; np.unique returns the dates sorted.
    bird_dates = [stamp.date() for stamp in BirdData[BirdData['bird_name'] == bird]['syllable_time']]
    for i, date in enumerate(tqdm.tqdm(np.unique(bird_dates))):
        #BirdData.loc[np.array((BirdData['bird_name'] == bird) & (np.array(all_dates) == date)), 'day_num'] = i
        rows_on_day = (BirdData['bird_name'] == bird) & (np.array(date) == all_dates)
        BirdData.loc[rows_on_day, 'day_num'] = i

    # Split this bird's syllables into bouts wherever the time gap is too long.
    bird_times = BirdData[BirdData['bird_name'] == bird]['syllable_time']
    idx_seqs = split_seq_by_time(
        np.array(bird_times.values),
        np.array(bird_times.index),
        max_timedelta=max_timedelta,
    )
    for seq_i, idxs in tqdm.tqdm(enumerate(idx_seqs)):
        seq_lengths.append(len(idxs))
        BirdData.loc[idxs, 'sequence_num'] = seq_num_tot
        seq_num_tot += 1
        # Position of each syllable inside its bout, in time order.
        rows_by_time = BirdData.loc[idxs].sort_values('syllable_time').index
        BirdData.loc[rows_by_time, 'sequence_syllable'] = np.arange(len(idxs))
    print(bird)
BirdData[0:3]
# Select every row of the bout that contains the largest 'sequence_syllable'
# value, then order that bout by syllable position.
syllable_seq = BirdData[BirdData['sequence_num'] == BirdData.loc[np.argmax(BirdData['sequence_syllable']), 'sequence_num']]
syllable_seq = syllable_seq.sort_values('sequence_syllable')
# Bare expressions: these only display a value when run as notebook cells.
np.array(syllable_seq['syllable_time'])
np.array(syllable_seq['syllable_time'])[0]
# NOTE(review): `bird` is not assigned in this cell; it carries whatever value
# earlier code left in it.
bird_times = BirdData[BirdData['bird_name']==bird]['syllable_time']
print bird_times[0:5]
print len(bird_times)
# The two string literals below are never assigned or executed; they act as
# commented-out code.  The first holds a copy of split_seq_by_time with extra
# print statements, the second a call that lists bout lengths for
# max_timedelta=.01.
"""def split_seq_by_time(times, idxs, max_timedelta = 30):
idxs_sorted = idxs[times.argsort()]
times.sort()
time_before = np.concatenate(
([0.],[(times[i] - times[i-1])/np.timedelta64(1, 's')
for i in np.arange(1,len(times))]))
sequence_breaks = np.unique(np.concatenate((
np.where(time_before > max_timedelta)[0], np.array([0,len(times)]))))
print sequence_breaks
print times[sequence_breaks[:-1]]
print times[sequence_breaks[:-1]-1]
print time_before[sequence_breaks[:-1]]
print len(sequence_breaks), len(time_before), len(times)
idx_seqs = [idxs_sorted[sequence_breaks[i]:sequence_breaks[i+1]] for
i in range(len(sequence_breaks[:-1]))]
return idx_seqs"""
"""[ len(i) for i in split_seq_by_time(np.array(bird_times.values),
np.array(bird_times.index),
max_timedelta=.01)]"""
# Histogram of the number of syllables in each bout.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 4))
_ = plt.hist(seq_lengths, bins=100)
plt.title('Distribution of sequence lengths (in syllables)')
BirdData[0:3]
#BirdData[BirdData['bird_name'] == bird]
# Bare expression: number of distinct bouts per bird (notebook display).
[
    (bird, len(np.unique(BirdData[BirdData['bird_name'] == bird]['sequence_num'])))
    for bird in np.unique(BirdData['bird_name'])
]

# Stack the per-row coordinate lists into one array and draw every point unlabelled.
log_z = np.array(list(BirdData['syllable_z_log']))
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(16, 16))
first_dim, second_dim = log_z.T[0], log_z.T[1]
ax.scatter(first_dim, second_dim, color='black', alpha=0.3, linewidth=0, s=5)
ax.axis('off')
def cluster_data(data, algorithm, args, kwds):
    """Cluster ``data`` and build a colour for every point.

    Parameters
    ----------
    data : array-like
        Passed unchanged to ``fit_predict``.
    algorithm : callable
        Called as ``algorithm(*args, **kwds)``; the result must provide
        ``fit_predict``.
    args : tuple
        Positional arguments for ``algorithm``.
    kwds : dict
        Keyword arguments for ``algorithm``.

    Returns
    -------
    labels
        Whatever ``fit_predict`` returned.
    palette
        Shuffled 'husl' palette with ``max(label) + 1`` colours.
    colors : list
        One colour per label; negative labels are drawn in light grey.
    """
    # Function taken from HDBSCAN python package website
    started_at = time.time()
    model = algorithm(*args, **kwds)
    labels = model.fit_predict(data)
    elapsed = time.time() - started_at
    print('Clustering took {:.2f} s'.format(elapsed))

    n_colours = np.unique(labels).max() + 1
    palette = sns.color_palette('husl', n_colours)
    # Shuffle so that consecutive label numbers do not get similar hues.
    random.shuffle(palette)

    unclustered_colour = (0.75, 0.75, 0.75)
    colors = [
        unclustered_colour if label < 0 else palette[label]
        for label in labels
    ]
    return labels, palette, colors
# Parameters for clustering
min_cluster_size_hbd = 20  # minimum of N syllables in group to be called a cluster
alpha_hbd = 1.
min_samples_hbd = min_cluster_size_hbd  #1

# Cluster the log-scaled coordinates with HDBSCAN.
hdbscan_kwds = {
    'min_cluster_size': min_cluster_size_hbd,
    'alpha': alpha_hbd,
    'min_samples': min_samples_hbd,
}
syllable_labels, palette, colors = cluster_data(
    data=log_z,
    algorithm=hdbscan.HDBSCAN,
    args=(),
    kwds=hdbscan_kwds,
)

# Store the labels twice: 'syllable_labels' is the working column and
# 'old_labels' keeps the clustering output as first produced.
for label_column in ('syllable_labels', 'old_labels'):
    BirdData[label_column] = syllable_labels

import copy
old_labels = copy.deepcopy(syllable_labels)

# Colours for the stored original labels; negative labels are light grey.
grey = (0.75, 0.75, 0.75)
colors_old = [grey if x < 0 else palette[x] for x in np.array(BirdData['old_labels'])]
# Scatter of every syllable, coloured by the original cluster label.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(32, 32))
#fig.suptitle('bold figure suptitle', fontsize=14, fontweight='bold')
plot_kwds = {'alpha' : 0.5, 's' : 10, 'linewidths':0}
sns.set_color_codes(palette='dark')
lz = np.array(list(BirdData['syllable_z_log'].values))
xs, ys = lz.T[0], lz.T[1]
ax.scatter(xs, ys, color=colors_old, **plot_kwds)
#for i in np.unique(old_labels):
#    ax.text(np.mean(log_z[old_labels == i].T[0]),
#            np.mean(log_z[old_labels == i].T[1]), i,
#            fontsize=12, fontweight='bold', alpha=0.5)
ax.axis('off')
# In the lines visible here, `spatial` is referenced only inside the disabled
# snippets below.
from scipy import spatial
#label_means = [np.mean([i for i in BirdData[BirdData['syllable_labels'] == j]['syllable_z_log'].values],axis = 0) for j in np.unique(BirdData['syllable_labels'])][1:]
# Unassigned string literal (never executed): gathers the labels and
# coordinates of all rows whose label is not -1.
"""all_labeled_syll = np.array(BirdData[BirdData['syllable_labels'] != -1]['syllable_labels'])
all_labeled_z = BirdData[BirdData['syllable_labels'] != -1]['syllable_z_log']
all_labeled_z = np.array([i[:] for i in all_labeled_z])"""
#all_means = np.array([np.mean(all_labeled_z[syllable_labels == label],axis=0) for label in np.unique(syllable_labels)])[1:]
#np.unique(syllable_labels)[1:]
# Unassigned string literal (never executed): loop that would overwrite each
# -1 label with the index of the nearest entry of label_means.
"""for i, label in enumerate(tqdm.tqdm(BirdData['syllable_labels'])):
if label == -1:
#nearest_neighbor = spatial.KDTree(all_labeled_z).query(BirdData['syllable_z_log'][i])[1]
#nearest_label = all_labeled_syll[nearest_neighbor]
nearest_neighbor = spatial.KDTree(label_means).query(BirdData['syllable_z_log'][i])[1]
BirdData.loc[i,'syllable_labels'] = nearest_neighbor
"""
# Fraction of bouts (not of individual syllables) reserved for testing.
pct_in_test = .3
sequence_ids = np.unique(BirdData['sequence_num'])
num_sequences = len(sequence_ids)
num_seq_in_test = int(pct_in_test*num_sequences)
# Draw from the sequence numbers that actually occur, and without replacement:
# the default replace=True can pick the same bout several times, which leaves
# fewer than num_seq_in_test distinct bouts in the test set.
test_sequences = np.random.choice(sequence_ids, num_seq_in_test, replace=False)

BirdData['Holdout'] = 'Training'  # set all data to training
# A single .loc assignment writes into BirdData itself; the chained form
# BirdData['Holdout'][mask] = ... may modify a temporary copy instead.
BirdData.loc[BirdData['sequence_num'].isin(test_sequences), 'Holdout'] = 'Testing'

BirdData.to_pickle('../../../data/Labelled_syllables/ClusteredData_syllables_silence_for_ZEKE.pickle')
#BirdData.to_csv('../../../data/Labelled_syllables/ClusteredData_syllables_silence_test.csv')
#np.unique(BirdData['syllable_labels'])
# Rebuild a shuffled palette sized to the current labels and recolour every point.
n_label_colours = np.unique(np.unique(BirdData['syllable_labels'])).max() + 1
palette = sns.color_palette('husl', n_label_colours)
random.shuffle(palette)
grey = (0.75, 0.75, 0.75)  # used for negative labels
colors = [grey if x < 0 else palette[x] for x in np.array(BirdData['syllable_labels'])]

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 20))
#fig.suptitle('bold figure suptitle', fontsize=14, fontweight='bold')
plot_kwds = {'alpha' : 1.0, 's' : 10, 'linewidths':0}
sns.set_color_codes(palette='dark')
lz = np.array(list(BirdData['syllable_z_log'].values))
labs = np.array(list(BirdData['syllable_labels'].values))
xs, ys = lz.T[0], lz.T[1]
ax.scatter(xs, ys, color=colors, **plot_kwds)
ax.axis('off')
# Scatter of every syllable, one colour per bird.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 20))
plot_kwds = {'alpha' : 0.25, 's' : 10, 'linewidths':0}
bird_palette = sns.color_palette('deep', len(np.unique(bird_name)) + 1)
sns.palplot(bird_palette)
bird_name = np.array(list(BirdData['bird_name'].values))
# The coordinates do not depend on the bird, so build them once.
lz = np.array(list(BirdData['syllable_z_log'].values))
for i, bird in enumerate(np.unique(bird_name)):
    #ax.scatter(lz.T[0], lz.T[1], color=colors, **plot_kwds)
    from_this_bird = bird_name == bird
    bird_points = lz[from_this_bird]
    ax.scatter(bird_points.T[0], bird_points.T[1], color=bird_palette[i], **plot_kwds)
ax.axis('off')
# Flatten the per-row coordinate lists and reshape them to one (x, y) pair per row.
n_rows = len(BirdData['syllable_z_log'])
flat_coords = np.concatenate(np.array(BirdData['syllable_z_log']))
z_log = np.reshape(flat_coords, (n_rows, 2))

fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 20))
plot_kwds = {'alpha' : 0.5, 's' : 10, 'linewidths':0}
#ax.scatter(lz.T[0], lz.T[1], color=colors, **plot_kwds)
# Colour each point by the log of its 'recon_length'.
log_lengths = np.log(np.array(BirdData['recon_length']))
ax.scatter(z_log[:, 0], z_log[:, 1], c=log_lengths, cmap='viridis', **plot_kwds)
ax.axis('off')
ax.set_title('Syllable as a function of length')
# Make an alpha based color scheme
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
from matplotlib.cbook import get_sample_data
from matplotlib.colors import LinearSegmentedColormap
def imscatter(x, y, image, ax=None, zoom=1):
    """Draw ``image`` centred on each (x, y) data point of ``ax``.

    Parameters
    ----------
    x, y : scalar or array-like
        Data coordinates; scalars are promoted to one-element arrays.
    image : numpy.ndarray or anything accepted by ``plt.imread``
        Pixel data, or an image source to be read.
    ax : matplotlib axes, optional
        Target axes; the current axes are used when omitted.
    zoom : float
        Scaling factor handed to ``OffsetImage``.

    Returns
    -------
    list
        The artists returned by ``ax.add_artist``, one per point.
    """
    if ax is None:
        ax = plt.gca()
    # Arrays are already pixel data, so do not send them through imread:
    # relying on imread to reject them with TypeError is fragile, because the
    # exception type raised for array input is not guaranteed.
    if not isinstance(image, np.ndarray):
        try:
            image = plt.imread(image)
        except TypeError:
            # Likely already an array-like image...
            pass
    im = OffsetImage(image, zoom=zoom)
    x, y = np.atleast_1d(x, y)
    artists = []
    for x0, y0 in zip(x, y):
        ab = AnnotationBbox(im, (x0, y0), xycoords='data', frameon=False)
        artists.append(ax.add_artist(ab))
    # add_artist does not extend the data limits, so include the points
    # explicitly before autoscaling.
    ax.update_datalim(np.column_stack([x, y]))
    ax.autoscale()
    return artists
# Scatter coloured by label, with one example spectrogram drawn at each label's mean position.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(32, 32))
#fig.suptitle('bold figure suptitle', fontsize=14, fontweight='bold')
plot_kwds = {'alpha' : 0.5, 's' : 10, 'linewidths':0}
sns.set_color_codes(palette='dark')
lz = np.array(list(BirdData['syllable_z_log'].values))
labs = np.array(list(BirdData['syllable_labels'].values))
specs = np.array(list(BirdData['specs'].values))
ax.scatter(lz.T[0], lz.T[1], color=colors, **plot_kwds)
for label in np.unique(np.unique(labs)):
    in_label = labs == label
    label_points = lz[in_label]
    x0 = np.mean(label_points.T[0])
    y0 = np.mean(label_points.T[1])
    # First spectrogram carrying this label, reshaped to 32x32, flipped
    # vertically and mapped through the afmhot colormap.
    first_spec = np.reshape(specs[in_label][0], (32, 32))
    img_3d = plt.cm.afmhot(np.flipud(first_spec))
    imscatter(x0, y0, img_3d, zoom=1, ax=ax)
ax.axis('off')
ax.set_title('Clustered Syllables')
BirdData[0:3]
# Build a grid of example spectrograms: one column per label, up to num_ex rows.
unique_labels = np.unique(BirdData['syllable_labels'])
len(unique_labels)
num_cats = len(unique_labels)
num_ex = 20
dim1 = dim2 = 32  # each spectrogram is reshaped to dim1 x dim2
canvas = np.zeros((dim1*num_ex, dim2*num_cats))
for ji, j in tqdm.tqdm(enumerate(unique_labels[0:num_cats])):
    specs = BirdData[BirdData['old_labels'] == j]['specs'][:num_ex].values
    # Iterate over the examples that exist.  The previous guard
    # `i <= len(specs)` still indexed specs[len(specs)] and raised IndexError
    # for any label with fewer than num_ex examples; those tiles now stay zero.
    for i, spec in enumerate(specs):
        canvas[i*dim1:i*dim1+dim1, ji*dim2:ji*dim2+dim2] = np.reshape(spec, (dim1, dim2))
## TODO: MAKE CANVAS BY BIRD
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(num_cats, num_ex))
ax.matshow(canvas, aspect='auto',
           cmap=plt.cm.afmhot, origin='lower')
ax.axis('off')
plt.show()
# make an interactive 2D interface, where you click on two things and it turns them into one group
# Grid of consecutive spectrograms from one bird: column ji holds that bird's
# examples ji*num_ex .. (ji+1)*num_ex - 1.
num_cats = 20
num_ex = 20
dim1 = dim2 = 32  # each spectrogram is reshaped to dim1 x dim2
canvas = np.zeros((dim1*num_ex, dim2*num_cats))
# The bird filter does not change between columns, so apply it once.
bird_specs = BirdData[BirdData['bird_name'] == 'b1080']['specs']
for ji, j in tqdm.tqdm(enumerate(unique_labels[0:num_cats])):
    specs = bird_specs[ji*num_ex:((ji+1)*(num_ex))].values
    # Iterate over the examples that exist.  The previous guard
    # `i <= len(specs)` still indexed specs[len(specs)] and raised IndexError
    # whenever a column had fewer than num_ex examples (including when the
    # bird has no rows at all); those tiles now stay zero.
    for i, spec in enumerate(specs):
        canvas[i*dim1:i*dim1+dim1, ji*dim2:ji*dim2+dim2] = np.reshape(spec, (dim1, dim2))
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(num_cats, num_ex))
ax.matshow(canvas, aspect='auto',
           cmap=plt.cm.afmhot, origin='lower')
ax.axis('off')
plt.show()
np.shape(specs)
# NOTE(review): `iii` is not defined anywhere in the visible file, so this line
# raises NameError -- presumably a deliberate stop marker for "Run All";
# confirm, or remove it.
iii